Abstract
MSDS6306: Doing Data Science - Case Study 01: Beer & Brewery. I cleaned the data and performed EDA to visualize distribution (log transformed when necessary) as well as summary statistics. I conducted various hypothesis tests, k-NN cross validation on normalized values, linear regression model, and correlation tests to derive r, its p-value, r squared. Lastly, I examined the geographical distribution of IBU in order to recommend appropriate beer attributes that cater to the unmet needs of the regional markets.Load the required packages
library(tidyverse)
library(cowplot)
library(maps)
library(ggmap)
library(viridis)
library(plotly)
library(caret) #confusion matrix
library(e1071)
library(class) #knn.cv
Import the data
breweries = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv')
beer = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv')
Breweries data has 558 observations and four columns:
dim(breweries)
## [1] 558 4
spec(breweries)
## cols(
## Brew_ID = col_double(),
## Name = col_character(),
## City = col_character(),
## State = col_character()
## )
Beer data has 2410 observations and seven columns:
dim(beer)
## [1] 2410 7
spec(beer)
## cols(
## Name = col_character(),
## Beer_ID = col_double(),
## ABV = col_double(),
## IBU = col_double(),
## Brewery_id = col_double(),
## Style = col_character(),
## Ounces = col_double()
## )
Merge datasets by primary key
df = merge(breweries, beer, by.x = 'Brew_ID', by.y = 'Brewery_id')
head(df)
Rename Name.x and Name.y
df = rename(df, Brewery = Name.x, Beer = Name.y)
head(df)
Clean the data:
cbind(lapply(lapply(df, is.na), sum))
## [,1]
## Brew_ID 0
## Brewery 0
## City 0
## State 0
## Beer 0
## Beer_ID 0
## ABV 62
## IBU 1005
## Style 5
## Ounces 0
final = df %>% na.exclude()
sum(as.numeric(is.na.data.frame(final)))
## [1] 0
Final dataset has 1403 observations and 10 columns:
1. Brew_ID: Unique identifier of the brewery
2. Brewery: Name of the brewery
3. City: City where brewery is located
4. State: U.S. state where brewery is located
5. Beer: Name of the beer
6. Beer_ID: Unique identifier of the beer
7. ABV: Alcohol by volume of the beer
8. IBU: International Bitterness Units of the beer
9. Ounces: Ounces of the beer
10. Style: Style of the beer
dim(final)
## [1] 1403 10
str(final)
## 'data.frame': 1403 obs. of 10 variables:
## $ Brew_ID: num 1 1 1 1 1 1 2 2 2 2 ...
## $ Brewery: chr "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" ...
## $ City : chr "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
## $ State : chr "MN" "MN" "MN" "MN" ...
## $ Beer : chr "Pumpion" "Stronghold" "Parapet ESB" "Get Together" ...
## $ Beer_ID: num 2689 2688 2687 2692 2691 ...
## $ ABV : num 0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
## $ IBU : num 38 25 47 50 26 19 42 68 80 25 ...
## $ Style : chr "Pumpkin Ale" "American Porter" "Extra Special / Strong Bitter (ESB)" "American IPA" ...
## $ Ounces : num 16 16 16 16 16 16 16 16 16 16 ...
## - attr(*, "na.action")= 'exclude' Named int [1:1007] 19 35 36 37 38 39 40 41 42 43 ...
## ..- attr(*, "names")= chr [1:1007] "19" "35" "36" "37" ...
The first six and last rows of the dataframe
head(final)
tail(final)
There are 558 total breweries. I’ve plotted their distribution by state.
breweries %>% group_by(State) %>% tally(sort=T)
Plot into a heatmap
states = map_data('state')
st_abb = data.frame(abb = state.abb, region = tolower(state.name))
brew = breweries %>% group_by(State) %>% tally(sort=T)
brew = subset(brew, State != 'HI' & State != 'AK')
brew = rename(brew, abb = State, count = n)
brew = merge(brew, st_abb, by='abb')
geo = merge(states, brew, by='region', all.x=T)
geo = geo[order(geo$order),]
center <- data.frame(region=tolower(state.name), long=state.center$x, lat=state.center$y)
center <- merge(center, brew, by="region", all.x = TRUE)
ggplot(geo, aes(x=long,y=lat)) +
geom_polygon(aes(group=group, fill=count)) +
geom_text(data=center, aes(long, lat, label=count)) +
scale_fill_gradient(low = "slategray1",
high = "royalblue4",
guide = "colorbar") +
ggtitle("Breweries per State") +
coord_map() + theme(axis.title = element_text(size = 20)) + theme_void() + theme(legend.position = c(0.9, 0.4), plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -0.1, t = 0.4, l = 2, unit = "cm")))
I computed median ABV and IBU for each state. Maine has the highest median ABV at 6.7% as well as the highest median IBU at 61.
median = final %>% select(State, ABV, IBU) %>% group_by(State) %>% summarize(med_abv = median(ABV), med_ibu = median(IBU))
arrange(median, -med_abv)
arrange(median, -med_ibu)
Kentucky has the max ABV at 12.5%
max(final$ABV)
## [1] 0.125
grep(.125, final$ABV)
## [1] 9
final$State[9]
## [1] "KY"
Oregon has the highest IBU at 138
max(final$IBU)
## [1] 138
grep(138, final$IBU)
## [1] 1132
final$State[1132]
## [1] "OR"
I plotted median ABV for each state.
median %>% mutate(perc = med_abv*100, State = fct_reorder(State, perc)) %>% ggplot(aes(State, perc)) + geom_point(size=4, color="royalblue2") + coord_flip() + labs(y='ABV (%)', title='Median Alcohol by Volume of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))
Median IBU for each state.
median %>% mutate(State = fct_reorder(State, med_ibu)) %>% ggplot(aes(State, med_ibu)) + geom_point(size=4, color="indianred2") + coord_flip() + labs(y='IBU', title='Median International Bitterness Unit of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))
Summary statistics and distribution of ABV
| Statistic | Percent |
|---|---|
| Min | 2.7 |
| Med | 5.7 |
| Mean | 6 |
| Max | 12.5 |
| IQR | 1.8 |
Right skew distribution suggests lower production at increasing ABV. Consumer preference (and thus sales) drives demand to infer most lean towards lower concentration of ABV. The median value of 5.7% is a better indicator of the distribution because the mean is sensitive to outliers and gets pulled towards the skewed tail.
Here is the boxplot & density plot of ABV distribution.
summary(final$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02700 0.05000 0.05700 0.05992 0.06800 0.12500
fivenum(final$ABV)
## [1] 0.027 0.050 0.057 0.068 0.125
IQR(final$ABV)
## [1] 0.018
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_boxplot(fill='royalblue', alpha=.3) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_cowplot() + theme(plot.title = element_text(size= 20), axis.ticks.y=element_blank(), axis.title.y=element_blank(), axis.text.y=element_blank(), legend.position = 'none', axis.title = element_text(size = 20)) + geom_segment(aes(x = 6, y = -.05, xend = 6, yend = .05, col='red'))
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_density(fill='indianred2', color='indianred3', alpha=.9) + geom_vline(xintercept=5.7, col='cornflowerblue') + geom_vline(xintercept = 6, linetype='dotted', col='paleturquoise3', size=1.1) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))
Relationship between ABV and IBU using linear regression model.
I performed a log transformation of the abv values due to its skewness and conducted a linear regression model. Just from the eye test, the regression line depicts a positive, linear relationship. In fact, with an R2 of .45, it is estimated that 45% of the variation in IBU is explained by abv. Furthermore, there is strong evidence to suggest abv and IBU are linearly correlated (p-val <.00001).The ABV values were log transformed before plotting.
final = final %>% mutate(log_abv = log(ABV))
cor.test(x=final$log_abv, y=final$IBU)
##
## Pearson's product-moment correlation
##
## data: final$log_abv and final$IBU
## t = 34.032, df = 1401, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6430317 0.7004025
## sample estimates:
## cor
## 0.6727271
model = lm(IBU~log_abv, data=final)
summary(model)
##
## Call:
## lm(formula = IBU ~ log_abv, data = final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.393 -12.410 -0.622 12.989 91.579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 272.589 6.773 40.24 <2e-16 ***
## log_abv 80.972 2.379 34.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.22 on 1401 degrees of freedom
## Multiple R-squared: 0.4526, Adjusted R-squared: 0.4522
## F-statistic: 1158 on 1 and 1401 DF, p-value: < 2.2e-16
ABV vs IBU
final %>% ggplot(aes(x=log_abv, y=IBU)) + geom_point(color="black", fill="#69b3a2", shape=22, alpha=0.5, size=3, stroke = 1) + geom_line(aes(x = log_abv,y = model$fit), color = "lightpink3", lwd=.8) + labs(title='Relationship between Bitterness of Beer and its Alcohol Content', subtitle = 'Linear Regression Model on Log Transformed ABV', x= 'ABV (logged)', y='IBU') + theme_classic() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))
ABV vs. IBU by Style
View all 90 different styles
final %>% group_by(Style) %>% tally(sort=T)
Style df of just IPA/ale styles. 32 different styles of IPA and ale encompassing 944 observations and 10 columns
style = final %>% group_by(Style) %>% filter(grepl('\\sAle|IPA$', Style))
dim(style)
## [1] 944 11
Distinguish ale from ipa
all_style = final %>% group_by(Style)
keep = grep('\\sAle', all_style$Style)
ale = all_style[keep,]
rem = grep('India', ale$Style)
ale = ale[-rem,] #552 obs, 27 ale styles
ale$Style = 'ale'
ipa = grep(('\\sIndia\\sPale\\sAle|IPA$'), all_style$Style)
ipa = all_style[ipa,] # 392 obs, 5 ipa styles
ipa$Style = 'ipa'
ipa_ale = rbind(ipa, ale)
ipa_ale$Style = as.factor(ipa_ale$Style)
Code the plots
pmain = ipa_ale %>% ggplot(aes(x=ABV, y=IBU, shape=factor(Style))) + geom_point(aes(color=factor(Style))) + theme(legend.position = 'none', axis.title = element_text(size = 20), panel.background = element_rect(fill = "gray97", color = NA))
xdens = axis_canvas(pmain, axis='x') + geom_density(data=ipa_ale, aes(ABV, fill=Style), alpha=.7, size=.2)
ydens = axis_canvas(pmain, axis='y', coord_flip = TRUE) + geom_density(data=ipa_ale, aes(IBU, fill=Style), alpha=.7, size=.2) + coord_flip()
p1 = insert_xaxis_grob(pmain, xdens, grid::unit(.2, 'null'), position='top')
p2 = insert_yaxis_grob(p1, ydens, grid::unit(.2, 'null'), position='right')
Plot scatterplot with marginal plot on the axis
ggdraw(p2) + draw_label('ABV vs. IBU by Style', x=.12, y=.98, size=20) + draw_label('IPA', x=.52, y=.95, size=13, color='paleturquoise4', fontface = 'bold') + draw_label('Ale', x=.35, y=.97, size=13, color='rosybrown', fontface = 'bold')
Conduct KNN classifier to investigate the difference with respect to ABV and IBU between IPAs and other types of Ale. I normalized both variables and performed a knn cross validation test with a k of 11. The model achieved a 78% accuracy (how well it correctly identified the style), almost 84% sensitivity (how many ales did the model correctly identify), and 70% specificity (how many ipa did the model correctly identify). Additionally, I log transformed abv and ibu conducted a Welch’s two sample t-test to conclude that there is strong evidence to suggest the true difference in both median abv and ibu value is different for ale and ipa.
normalize = function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
ipa_ale = ipa_ale %>% mutate(norm_abv = normalize(ABV), norm_ibu = normalize(IBU))
model = knn.cv(ipa_ale[,c(12:13)], ipa_ale$Style, k=11)
confusionMatrix(model, ipa_ale$Style)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ale ipa
## ale 459 117
## ipa 93 275
##
## Accuracy : 0.7775
## 95% CI : (0.7496, 0.8037)
## No Information Rate : 0.5847
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5378
##
## Mcnemar's Test P-Value : 0.1125
##
## Sensitivity : 0.8315
## Specificity : 0.7015
## Pos Pred Value : 0.7969
## Neg Pred Value : 0.7473
## Prevalence : 0.5847
## Detection Rate : 0.4862
## Detection Prevalence : 0.6102
## Balanced Accuracy : 0.7665
##
## 'Positive' Class : ale
##
Conduct t.test
ipa_ale = ipa_ale %>% mutate(logabv=log(ABV), logibu=log(IBU))
t.test(logabv~Style, data=ipa_ale)
##
## Welch Two Sample t-test
##
## data: logabv by Style
## t = -16.91, df = 849.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
## -0.2260548 -0.1790364
## sample estimates:
## mean in group ale mean in group ipa
## -2.889941 -2.687395
t.test(logibu~Style, data=ipa_ale)
##
## Welch Two Sample t-test
##
## data: logibu by Style
## t = -30.993, df = 875.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
## -0.8911237 -0.7849819
## sample estimates:
## mean in group ale mean in group ipa
## 3.399569 4.237622
Portland has the highest number of breweries with 39 and its max IBU is 103. Similarly, SD has the third highest number of breweries with 35 and its max IBU is 115. I’ve only given you two examples (due to time constraint), but there is a strong linear correlation between breweries per city and its maximum IBU (correlation coefficient of .79 and p-value of less than .00001). Additionally, it is estimated that 63% of the variation in max IBU is explained by its linear relationship with breweries per city. This leads me to infer cities with a high number of breweries have a more developed palate for crafty beers that have high IBU.
Why is this important? Boston is ranked 12th in breweries per city yet its maximum IBU is only 45. That’s 50% less than the max IBU compared to cities with similar demographics. In fact, if you take the max IBU of all the cities in MA and extract the median, Boston is still 30 IBU lower. I believe that Boston is an untapped market with a need for higher IBU beer production.
IBU distribution on US map (static plot)
states = map_data('state')
usabeer = final %>% select(City, State, ABV, IBU)
usabeer = subset(usabeer, State != 'HI' & State != 'AK')
usabeer$citystate = str_c(usabeer$City, ", ", usabeer$State)
usabeer = cbind(geocode(as.character(usabeer$citystate)), usabeer)
usabeer = usabeer %>% arrange(IBU) %>% mutate(cs=factor(citystate, unique(citystate)))
ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group), fill='grey', alpha=.3) + geom_point(data=usabeer, aes(lon, y=lat, size=IBU, color=IBU, alpha=IBU), shape=20) + scale_size_continuous(name='IBU', range=c(1,10), breaks=c(1, 25, 50, 75, 100)) + scale_alpha_continuous(name='IBU', range=c(.1, .9), breaks=c(1, 25, 50, 75, 100)) + scale_color_viridis(option='magma', direction=-1, breaks=c(1, 25, 50, 75, 100), name='IBU') + theme_void() + coord_map() + borders('state') + guides( colour = guide_legend()) +
ggtitle("Distribution of International Bitterness Unit by Cities") +
theme(
legend.position = c(0.85, 0.25),
text = element_text(color = "#22211d"),
plot.background = element_rect(fill = "#f5f5f2", color = NA),
panel.background = element_rect(fill = "#f5f5f2", color = NA),
legend.background = element_rect(fill = "#f5f5f2", color = NA),
plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -.5, t = 0.4, l = 2, unit = "cm")),
)
Plot IBU distribution
summary(final$IBU)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 21.00 35.00 42.74 64.00 138.00
usabeer %>% ggplot(aes(IBU)) + geom_density(fill='royalblue', alpha=.7) + geom_vline(xintercept=35, col='red') + geom_vline(xintercept = 42.74, linetype='dotted', col='turquoise', size=1.1) + labs(title='International Bitterness Unit') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))
IBU distribution on US map (PLOTLY)
usabeer = usabeer %>% mutate( mytext=paste(citystate, "\n", "IBU: ", IBU, sep=""))
p.usa = ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group)) + geom_point(data=usabeer, aes(x=lon, y=lat, size=IBU, color=IBU, alpha=IBU, text=mytext)) + scale_size_continuous(range=c(1,7)) + scale_color_viridis(option='inferno', direction = -1) + theme_void() + coord_map() + borders('state', fill='whitesmoke', alpha=.3) + labs(title='Distribution of International Bitterness Unit by Cities') + theme(legend.position = 'none', plot.title = element_text(size=25, hjust=.5))
p.usa = ggplotly(p.usa, tooltip='text')
p.usa
Number of breweries per city in descending order
nbrew = final %>% select(IBU, City, State) %>% group_by(City, State) %>% tally(sort=T)
nbrew
MAX IBU for each city
maxibu = final %>% select(IBU, City, State) %>% group_by(State, City) %>% slice(which.max(IBU))
maxibu = arrange(maxibu, -IBU)
maxibu
merge, conduct cor test
ibu.brew.cor = as.data.frame(cbind(maxibu$IBU, nbrew$n))
ibu.brew.cor = rename(ibu.brew.cor, max_ibu=V1, n_brew = V2)
cor(ibu.brew.cor) # .7939
## max_ibu n_brew
## max_ibu 1.0000000 0.7938968
## n_brew 0.7938968 1.0000000
cor.test(x=ibu.brew.cor$n_brew, y=ibu.brew.cor$max_ibu, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: ibu.brew.cor$n_brew and ibu.brew.cor$max_ibu
## t = 22.196, df = 289, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7471146 0.8328525
## sample estimates:
## cor
## 0.7938968
ma = maxibu %>% select(State, IBU) %>% filter(State == 'MA') %>% summarize(value = IBU)
## Adding missing grouping variables: `City`
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
mean(ma$value)
## [1] 68.09091
fivenum(ma$value)
## [1] 35.0 45.0 69.0 82.5 130.0